Data Quality Check

Import libraries

library(bupaverse)
## Warning: package 'bupaverse' was built under R version 4.2.3
## 
## .______    __    __  .______      ___   ____    ____  _______ .______          _______. _______
## |   _  \  |  |  |  | |   _  \    /   \  \   \  /   / |   ____||   _  \        /       ||   ____|
## |  |_)  | |  |  |  | |  |_)  |  /  ^  \  \   \/   /  |  |__   |  |_)  |      |   (----`|  |__
## |   _  <  |  |  |  | |   ___/  /  /_\  \  \      /   |   __|  |      /        \   \    |   __|
## |  |_)  | |  `--'  | |  |     /  _____  \  \    /    |  |____ |  |\  \----.----)   |   |  |____
## |______/   \______/  | _|    /__/     \__\  \__/     |_______|| _| `._____|_______/    |_______|
##                                                                                                 
## ── Attaching packages ─────────────────────────────────────── bupaverse 0.1.0 ──
## ✔ bupaR         0.5.3     ✔ processcheckR 0.1.4
## ✔ edeaR         0.9.4     ✔ processmapR   0.5.3
## ✔ eventdataR    0.3.1
## Warning: package 'bupaR' was built under R version 4.2.3
## Warning: package 'processcheckR' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── bupaverse_conflicts() ──
## ✖ bupaR::filter()          masks stats::filter()
## ✖ processmapR::frequency() masks stats::frequency()
## ✖ edeaR::setdiff()         masks base::setdiff()
## ✖ bupaR::timestamp()       masks utils::timestamp()
## ✖ processcheckR::xor()     masks base::xor()
library(daqapo)
## Warning: package 'daqapo' was built under R version 4.2.3
## 
## Attaching package: 'daqapo'
## 
## The following object is masked from 'package:eventdataR':
## 
##     hospital
## 
## The following object is masked from 'package:utils':
## 
##     fix

Read XES

# event_log <- xesreadR::read_xes("../2_to_xes/mimicel.xes", validate = FALSE)

Importing the XES file into bupaR fails with the error message shown below. Hence, we use the CSV file to assess the data quality of the event log instead.

Error: cannot allocate vector of size 1.6 Gb
10. id(list(col_id, row_id), drop = FALSE)
9. spread.data.frame(., type, value)
8. spread(., type, value)
7. select(., -attr_id)
6. list2(...)
5. bind_cols(., eventlog)
4. select(., -n_attributes, -attr_id)
3. spread(., key, value)
2. all_attrs %>% unlist() %>% as_data_frame() %>% mutate(type = rep(c("key",
"value"), length = nrow(.)), attr_id = rep(1:(nrow(.)/2),
each = 2)) %>% spread(type, value) %>% select(-attr_id) %>%
bind_cols(eventlog) %>% select(-n_attributes, -attr_id) %>% ...
1. xesreadR::read_xes("../2_to_xes/mimicel.xes", validate = FALSE)

Read CSV

Import the event log from CSV

# Load the CSV export of the event log into a plain data frame.
# Empty cells and cells holding a single space are read as NA.
eventlog_df <- read.csv(
  "../2_to_xes/mimicel.csv",
  sep = ",",
  na.strings = c("", " ")
)

Convert the data frame to an event log and an activity log, adding activity_instance_id and lifecycle_id

# Build a bupaR event log from the raw data frame.
#
# The columns bupaR needs but the data frame lacks are synthesised first:
#   * resource_id          - set to NA for every row
#   * lifecycle_id         - every row is marked "complete"
#   * activity_instance_id - the numeric row name, so each row is its own
#                            activity instance
# The result is assigned with a leading `<-` so the target name is visible at
# the top of the pipeline instead of trailing after the last step.
event_log <- eventlog_df %>%
  bupaR::convert_timestamps(columns = "timestamps", format = ymd_hms) %>%
  bupaR::mutate(
    resource_id = NA,
    lifecycle_id = "complete",
    activity_instance_id = as.numeric(row.names(.))
  ) %>%
  bupaR::eventlog(
    case_id = "stay_id",
    activity_id = "activity",
    activity_instance_id = "activity_instance_id",
    timestamp = "timestamps",
    lifecycle_id = "lifecycle_id",
    resource_id = "resource_id"
  )

# The `daqapo` data-quality checks operate on an `activitylog`, so derive one
# from the event log. Left-assignment keeps the target name up front.
activity_log <- event_log %>%
  bupaR::to_activitylog()
## Warning in to_activitylog.eventlog(.): No start events were found. Creating and
## initialising 'start' to NA.

Inspect event_log

Show identifiers for event_log

# Print which columns act as the case, activity, resource, activity-instance,
# timestamp and lifecycle identifiers of the log.
bupaR::mapping(event_log)
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Activity instance identifier:    activity_instance_id 
## Timestamp:           timestamps 
## Lifecycle transition:        lifecycle_id

Show the number of activities, events, cases, and traces in event_log

# Size of the log: distinct activities, events, cases and distinct traces.
bupaR::n_activities(event_log)
## [1] 6
bupaR::n_events(event_log)
## [1] 7568824
bupaR::n_cases(event_log)
## [1] 425028
bupaR::n_traces(event_log)
## [1] 175671

Show unique activities

# Table of the distinct activities with their absolute and relative frequency.
bupaR::activities(event_log)
## # A tibble: 6 × 3
##   activity                absolute_frequency relative_frequency
##   <chr>                                <int>              <dbl>
## 1 Medicine reconciliation            2953118             0.390 
## 2 Medicine dispensations             1441839             0.190 
## 3 Vital sign check                   1423734             0.188 
## 4 Discharge from the ED               900077             0.119 
## 5 Enter the ED                        425028             0.0562
## 6 Triage in the ED                    425028             0.0562

Show unique traces

# Table of the distinct traces (activity sequences) with their frequencies.
bupaR::traces(event_log)
## # A tibble: 175,671 × 3
##    trace                                   absolute_frequency relative_frequency
##    <chr>                                                <int>              <dbl>
##  1 Enter the ED,Triage in the ED,Vital si…               7108            0.0167 
##  2 Enter the ED,Triage in the ED,Discharg…               4994            0.0117 
##  3 Enter the ED,Triage in the ED,Vital si…               4244            0.00999
##  4 Enter the ED,Triage in the ED,Vital si…               3937            0.00926
##  5 Enter the ED,Triage in the ED,Vital si…               3216            0.00757
##  6 Enter the ED,Triage in the ED,Discharg…               2129            0.00501
##  7 Enter the ED,Triage in the ED,Vital si…               1696            0.00399
##  8 Enter the ED,Triage in the ED,Vital si…               1515            0.00356
##  9 Enter the ED,Triage in the ED,Vital si…               1431            0.00337
## 10 Enter the ED,Triage in the ED,Medicine…               1349            0.00317
## # ℹ 175,661 more rows

Data Quality Assessment

The table below summarizes the different data quality assessment tests available in daqapo, after which each test will be briefly demonstrated.

| Function name | Description | Output |
| --- | --- | --- |
| detect_activity_frequency_violations | Function that detects activity frequency anomalies per case | Summary in console + Returns activities in cases which are executed too many times |
| detect_attribute_dependencies | Function detecting violations of dependencies between attributes (i.e. condition(s) that should hold when (an)other condition(s) hold(s)) | Summary in console + Returns rows with dependency violations |
| detect_missing_values | Function detecting missing values at different levels of aggregation | Summary in console + Returns rows with NAs |
| detect_multiregistration | Function detecting the registration of a series of events in a short time period for the same case or by the same resource | Summary in console + Returns rows with multiregistration on resource or case level |
| detect_unique_values | Function listing all distinct combinations of the given log attributes | Summary in console + Returns all unique combinations of values in given columns |
| detect_value_range_violations | Function detecting violations of the range of acceptable values | Summary in console + Returns rows with value range infringements |

Detect activity frequency anomalies

# Flag cases in which "Enter the ED" or "Triage in the ED" is executed more
# often than the threshold given for it (1 for each).
activity_log %>%
  daqapo::detect_activity_frequency_violations(
    "Enter the ED" = 1,
    "Triage in the ED" = 1
  )
## *** OUTPUT ***
## For 0 cases in the activity log (0%) an anomaly is detected.

Detect Attributes Dependencies

Detect cases with disposition == "HOME" whose hadm_id is NA

# On "Discharge from the ED" rows whose seq_num is missing or equal to 1
# (presumably one row per stay -- TODO confirm), check the rule
# "disposition is HOME  =>  hadm_id is NA".
activity_log %>%
  bupaR::filter(
    activity == "Discharge from the ED",
    is.na(seq_num) | seq_num == 1
  ) %>%
  daqapo::detect_attribute_dependencies(
    antecedent = (disposition == "HOME"),
    consequent = is.na(hadm_id)
  )
## *** OUTPUT ***
## The following statement was checked: if condition(s) ~(disposition == "HOME") hold(s), then ~is.na(hadm_id) should also hold.
## This statement holds for 205129 (84.9%) of the rows in the activity log for which the first condition(s) hold and does not hold for 36497 (15.1%) of these rows.
## For the following rows, the first condition(s) hold(s), but the second condition does not:
## # Log of 241622 events consisting of:
## 1 trace 
## 241622 cases 
## 241622 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 241,622 × 35
##     stay_id subject_id  hadm_id activity          gender race  arrival_transport
##       <int>      <int>    <int> <chr>             <chr>  <chr> <chr>            
##  1 30000204   11615015 25540031 Discharge from t… <NA>   <NA>  <NA>             
##  2 30000252   18684072 28532292 Discharge from t… <NA>   <NA>  <NA>             
##  3 30000254   11158447       NA Discharge from t… <NA>   <NA>  <NA>             
##  4 30000262   19454512       NA Discharge from t… <NA>   <NA>  <NA>             
##  5 30000291   11212357       NA Discharge from t… <NA>   <NA>  <NA>             
##  6 30000389   11928692       NA Discharge from t… <NA>   <NA>  <NA>             
##  7 30000417   10275184       NA Discharge from t… <NA>   <NA>  <NA>             
##  8 30000443   12356587       NA Discharge from t… <NA>   <NA>  <NA>             
##  9 30000448   15135064       NA Discharge from t… <NA>   <NA>  <NA>             
## 10 30000479   19039924       NA Discharge from t… <NA>   <NA>  <NA>             
## # ℹ 241,612 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect cases with disposition == "ADMITTED" whose hadm_id is NA

# On "Discharge from the ED" rows whose seq_num is missing or equal to 1,
# check the rule "disposition is ADMITTED  =>  hadm_id is NA".
# NOTE(review): the rows of interest here are the ones for which this
# statement *holds* (admitted stays with no hadm_id), not the ones reported
# as not holding.
activity_log %>%
  bupaR::filter(
    activity == "Discharge from the ED",
    is.na(seq_num) | seq_num == 1
  ) %>%
  daqapo::detect_attribute_dependencies(
    antecedent = (disposition == "ADMITTED"),
    consequent = is.na(hadm_id)
  )
## *** OUTPUT ***
## The following statement was checked: if condition(s) ~(disposition == "ADMITTED") hold(s), then ~is.na(hadm_id) should also hold.
## This statement holds for 384 (0.24%) of the rows in the activity log for which the first condition(s) hold and does not hold for 157626 (99.76%) of these rows.
## For the following rows, the first condition(s) hold(s), but the second condition does not:
## # Log of 158010 events consisting of:
## 1 trace 
## 158010 cases 
## 158010 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 158,010 × 35
##     stay_id subject_id  hadm_id activity          gender race  arrival_transport
##       <int>      <int>    <int> <chr>             <chr>  <chr> <chr>            
##  1 30000012   11714491 21562392 Discharge from t… <NA>   <NA>  <NA>             
##  2 30000038   13821532 26255538 Discharge from t… <NA>   <NA>  <NA>             
##  3 30000039   13340997 23100190 Discharge from t… <NA>   <NA>  <NA>             
##  4 30000177   17937834 23831044 Discharge from t… <NA>   <NA>  <NA>             
##  5 30000275   13297743 26874680 Discharge from t… <NA>   <NA>  <NA>             
##  6 30000317   13658097 23069398 Discharge from t… <NA>   <NA>  <NA>             
##  7 30000368   18563034 29198602 Discharge from t… <NA>   <NA>  <NA>             
##  8 30000379   15293245 21532833 Discharge from t… <NA>   <NA>  <NA>             
##  9 30000426   16592013 26871835 Discharge from t… <NA>   <NA>  <NA>             
## 10 30000492   15071337 27867822 Discharge from t… <NA>   <NA>  <NA>             
## # ℹ 158,000 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect Missing Values

Overview missing values for each column

# Absolute and relative number of missing values for every column of the log.
daqapo::detect_missing_values(
  activity_log,
  level_of_aggregation = "overview"
)
## Selected level of aggregation:overview
## *** OUTPUT ***
## Absolute number of missing values per column:
##                             
## stay_id                    0
## subject_id                 0
## hadm_id              3015236
## activity                   0
## gender               7143796
## race                 7143796
## arrival_transport    7143796
## disposition          6668747
## seq_num              6669845
## icd_code             6669845
## icd_version          6669845
## icd_title            6669845
## temperature          6259715
## heartrate            5801023
## resprate             5822404
## o2sat                5864053
## sbp                  5812828
## dbp                  5813628
## pain                 6131510
## acuity               7150772
## chiefcomplaint       7143816
## rhythm               7515113
## name                 3173867
## gsn                  3206271
## ndc                  4615706
## etc_rn               4615706
## etccode              4627281
## etcdescription       4627281
## med_rn               6126985
## gsn_rn               6126985
## resource_id          7568824
## activity_instance_id       0
## .order                     0
## complete                   0
## start                7568824
## Relative number of missing values per column (expressed as percentage):
##                               
## stay_id                0.00000
## subject_id             0.00000
## hadm_id               39.83758
## activity               0.00000
## gender                94.38449
## race                  94.38449
## arrival_transport     94.38449
## disposition           88.10810
## seq_num               88.12261
## icd_code              88.12261
## icd_version           88.12261
## icd_title             88.12261
## temperature           82.70393
## heartrate             76.64365
## resprate              76.92614
## o2sat                 77.47641
## sbp                   76.79962
## dbp                   76.81019
## pain                  81.01008
## acuity                94.47666
## chiefcomplaint        94.38476
## rhythm                99.29037
## name                  41.93342
## gsn                   42.36155
## ndc                   60.98313
## etc_rn                60.98313
## etccode               61.13606
## etcdescription        61.13606
## med_rn                80.95029
## gsn_rn                80.95029
## resource_id          100.00000
## activity_instance_id   0.00000
## .order                 0.00000
## complete               0.00000
## start                100.00000
## Overview of activity log rows which are incomplete:
## # Log of 7568824 events consisting of:
## 425028 cases 
## 7568824 instances of 6 activities 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 7,568,824 × 35
##     stay_id subject_id  hadm_id activity          gender race  arrival_transport
##       <int>      <int>    <int> <chr>             <chr>  <chr> <chr>            
##  1 30000012   11714491 21562392 Vital sign check  <NA>   <NA>  <NA>             
##  2 30000012   11714491 21562392 Enter the ED      F      WHITE AMBULANCE        
##  3 30000012   11714491 21562392 Triage in the ED  <NA>   <NA>  <NA>             
##  4 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  5 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  6 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  7 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  8 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  9 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
## 10 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
## # ℹ 7,568,814 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect missing values for gender

# Count missing `gender` values among "Enter the ED" events only.
activity_log %>%
  bupaR::filter(activity == "Enter the ED") %>%
  daqapo::detect_missing_values(
    level_of_aggregation = "column",
    column = "gender"
  )
## Selected level of aggregation:column
## *** OUTPUT ***
## Absolute number of missing values in columngender:0
## Relative number of missing values in columngender(expressed as percentage):0
## 
## Overview of activity log rows in whichgenderis missing:
## EMPTY EVENT LOG
## # A tibble: 0 × 35
## # ℹ 35 variables: stay_id <int>, subject_id <int>, hadm_id <int>,
## #   activity <chr>, gender <chr>, race <chr>, arrival_transport <chr>,
## #   disposition <chr>, seq_num <int>, icd_code <chr>, icd_version <int>,
## #   icd_title <chr>, temperature <dbl>, heartrate <dbl>, resprate <dbl>,
## #   o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>, acuity <dbl>,
## #   chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>, ndc <dbl>,
## #   etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>, …

Detect missing values for race

# Count missing `race` values among "Enter the ED" events only.
activity_log %>%
  bupaR::filter(activity == "Enter the ED") %>%
  daqapo::detect_missing_values(
    level_of_aggregation = "column",
    column = "race"
  )
## Selected level of aggregation:column
## *** OUTPUT ***
## Absolute number of missing values in columnrace:0
## Relative number of missing values in columnrace(expressed as percentage):0
## 
## Overview of activity log rows in whichraceis missing:
## EMPTY EVENT LOG
## # A tibble: 0 × 35
## # ℹ 35 variables: stay_id <int>, subject_id <int>, hadm_id <int>,
## #   activity <chr>, gender <chr>, race <chr>, arrival_transport <chr>,
## #   disposition <chr>, seq_num <int>, icd_code <chr>, icd_version <int>,
## #   icd_title <chr>, temperature <dbl>, heartrate <dbl>, resprate <dbl>,
## #   o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>, acuity <dbl>,
## #   chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>, ndc <dbl>,
## #   etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>, …

Detect missing values for arrival_transport

# Count missing `arrival_transport` values among "Enter the ED" events only.
activity_log %>%
  bupaR::filter(activity == "Enter the ED") %>%
  daqapo::detect_missing_values(
    level_of_aggregation = "column",
    column = "arrival_transport"
  )
## Selected level of aggregation:column
## *** OUTPUT ***
## Absolute number of missing values in columnarrival_transport:0
## Relative number of missing values in columnarrival_transport(expressed as percentage):0
## 
## Overview of activity log rows in whicharrival_transportis missing:
## EMPTY EVENT LOG
## # A tibble: 0 × 35
## # ℹ 35 variables: stay_id <int>, subject_id <int>, hadm_id <int>,
## #   activity <chr>, gender <chr>, race <chr>, arrival_transport <chr>,
## #   disposition <chr>, seq_num <int>, icd_code <chr>, icd_version <int>,
## #   icd_title <chr>, temperature <dbl>, heartrate <dbl>, resprate <dbl>,
## #   o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>, acuity <dbl>,
## #   chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>, ndc <dbl>,
## #   etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>, …

Detect missing values for disposition

# Count missing `disposition` values among "Discharge from the ED" events only.
activity_log %>%
  bupaR::filter(activity == "Discharge from the ED") %>%
  daqapo::detect_missing_values(
    level_of_aggregation = "column",
    column = "disposition"
  )
## Selected level of aggregation:column
## *** OUTPUT ***
## Absolute number of missing values in columndisposition:0
## Relative number of missing values in columndisposition(expressed as percentage):0
## 
## Overview of activity log rows in whichdispositionis missing:
## EMPTY EVENT LOG
## # A tibble: 0 × 35
## # ℹ 35 variables: stay_id <int>, subject_id <int>, hadm_id <int>,
## #   activity <chr>, gender <chr>, race <chr>, arrival_transport <chr>,
## #   disposition <chr>, seq_num <int>, icd_code <chr>, icd_version <int>,
## #   icd_title <chr>, temperature <dbl>, heartrate <dbl>, resprate <dbl>,
## #   o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>, acuity <dbl>,
## #   chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>, ndc <dbl>,
## #   etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>, …

Detect missing values for acuity

# Count missing `acuity` values among "Triage in the ED" events only.
activity_log %>%
  bupaR::filter(activity == "Triage in the ED") %>%
  daqapo::detect_missing_values(
    level_of_aggregation = "column",
    column = "acuity"
  )
## Selected level of aggregation:column
## *** OUTPUT ***
## Absolute number of missing values in columnacuity:6976
## Relative number of missing values in columnacuity(expressed as percentage):1.64130363176073
## 
## Overview of activity log rows in whichacuityis missing:
## # Log of 6976 events consisting of:
## 1 trace 
## 6976 cases 
## 6976 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 6,976 × 35
##     stay_id subject_id  hadm_id activity         gender race  arrival_transport
##       <int>      <int>    <int> <chr>            <chr>  <chr> <chr>            
##  1 30001785   16061348 28146972 Triage in the ED <NA>   <NA>  <NA>             
##  2 30003428   11889374 25084216 Triage in the ED <NA>   <NA>  <NA>             
##  3 30003505   11229262       NA Triage in the ED <NA>   <NA>  <NA>             
##  4 30003941   14334804       NA Triage in the ED <NA>   <NA>  <NA>             
##  5 30004017   13419676 29317041 Triage in the ED <NA>   <NA>  <NA>             
##  6 30004518   17237928 26689098 Triage in the ED <NA>   <NA>  <NA>             
##  7 30006274   16169853 29415170 Triage in the ED <NA>   <NA>  <NA>             
##  8 30007594   10554696 24910876 Triage in the ED <NA>   <NA>  <NA>             
##  9 30008125   18064328 22704256 Triage in the ED <NA>   <NA>  <NA>             
## 10 30011041   12155635 27459698 Triage in the ED <NA>   <NA>  <NA>             
## # ℹ 6,966 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect Multiregistration

Detect Multiregistration for activity Medicine reconciliation

# Per case, look for "Medicine reconciliation" instances registered in quick
# succession, judged on the `complete` timestamp with a 61-second threshold.
activity_log %>%
  bupaR::filter(activity == "Medicine reconciliation") %>%
  daqapo::detect_multiregistration(
    level_of_aggregation = "case",
    timestamp = "complete",
    threshold_in_seconds = 61
  )
## Selected level of aggregation: case
## Selected timestamp parameter value: complete
## *** OUTPUT ***
## Multi-registration is detected for 270086 of the 304369 cases (88.74%) of the cases. These cases are:
## 
## For the following rows in the activity log, multi-registration is detected:
## # Log of 2902153 events consisting of:
## 270086 cases 
## 2902153 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 2,902,153 × 35
##     stay_id subject_id  hadm_id activity          gender race  arrival_transport
##       <int>      <int>    <int> <chr>             <chr>  <chr> <chr>            
##  1 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  2 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  3 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  4 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  5 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  6 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  7 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  8 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
##  9 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
## 10 30000012   11714491 21562392 Medicine reconci… <NA>   <NA>  <NA>             
## # ℹ 2,902,143 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect Multiregistration for activity Medicine dispensations

# Per case, look for "Medicine dispensations" instances registered in quick
# succession, judged on the `complete` timestamp with a 61-second threshold.
activity_log %>%
  bupaR::filter(activity == "Medicine dispensations") %>%
  daqapo::detect_multiregistration(
    level_of_aggregation = "case",
    timestamp = "complete",
    threshold_in_seconds = 61
  )
## Selected level of aggregation: case
## Selected timestamp parameter value: complete
## *** OUTPUT ***
## Multi-registration is detected for 210732 of the 295998 cases (71.19%) of the cases. These cases are:
## 
## For the following rows in the activity log, multi-registration is detected:
## # Log of 1076495 events consisting of:
## 210732 cases 
## 1076495 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 1,076,495 × 35
##     stay_id subject_id  hadm_id activity          gender race  arrival_transport
##       <int>      <int>    <int> <chr>             <chr>  <chr> <chr>            
##  1 30000012   11714491 21562392 Medicine dispens… <NA>   <NA>  <NA>             
##  2 30000012   11714491 21562392 Medicine dispens… <NA>   <NA>  <NA>             
##  3 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  4 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  5 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  6 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  7 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  8 30000017   14230614       NA Medicine dispens… <NA>   <NA>  <NA>             
##  9 30000038   13821532 26255538 Medicine dispens… <NA>   <NA>  <NA>             
## 10 30000038   13821532 26255538 Medicine dispens… <NA>   <NA>  <NA>             
## # ℹ 1,076,485 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect Multiregistration for activity Vital sign check

# Per case, look for "Vital sign check" instances registered in quick
# succession, judged on the `complete` timestamp with a 61-second threshold.
# `filter` is namespace-qualified like everywhere else in this document, so
# the call does not depend on which attached package currently masks `filter`.
activity_log %>%
  bupaR::filter(activity == "Vital sign check") %>%
  daqapo::detect_multiregistration(
    level_of_aggregation = "case",
    timestamp = "complete",
    threshold_in_seconds = 61
  )
## Selected level of aggregation: case
## Selected timestamp parameter value: complete
## *** OUTPUT ***
## Multi-registration is detected for 8666 of the 398828 cases (2.17%) of the cases. These cases are:
## 
## For the following rows in the activity log, multi-registration is detected:
## # Log of 20078 events consisting of:
## 29 traces 
## 8666 cases 
## 20078 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 20,078 × 35
##     stay_id subject_id  hadm_id activity         gender race  arrival_transport
##       <int>      <int>    <int> <chr>            <chr>  <chr> <chr>            
##  1 30000252   18684072 28532292 Vital sign check <NA>   <NA>  <NA>             
##  2 30000252   18684072 28532292 Vital sign check <NA>   <NA>  <NA>             
##  3 30001460   16680046 21068480 Vital sign check <NA>   <NA>  <NA>             
##  4 30001460   16680046 21068480 Vital sign check <NA>   <NA>  <NA>             
##  5 30001802   15750321 25185357 Vital sign check <NA>   <NA>  <NA>             
##  6 30001802   15750321 25185357 Vital sign check <NA>   <NA>  <NA>             
##  7 30002186   17728787 26340932 Vital sign check <NA>   <NA>  <NA>             
##  8 30002186   17728787 26340932 Vital sign check <NA>   <NA>  <NA>             
##  9 30007594   10554696 24910876 Vital sign check <NA>   <NA>  <NA>             
## 10 30007594   10554696 24910876 Vital sign check <NA>   <NA>  <NA>             
## # ℹ 20,068 more rows
## # ℹ 28 more variables: disposition <chr>, seq_num <int>, icd_code <chr>,
## #   icd_version <int>, icd_title <chr>, temperature <dbl>, heartrate <dbl>,
## #   resprate <dbl>, o2sat <dbl>, sbp <dbl>, dbp <dbl>, pain <chr>,
## #   acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>, name <chr>, gsn <int>,
## #   ndc <dbl>, etc_rn <int>, etccode <int>, etcdescription <chr>, med_rn <int>,
## #   gsn_rn <int>, resource_id <lgl>, activity_instance_id <dbl>, …

Detect Value Range Violations

Detect invalid value range of acuity

# Among "Triage in the ED" events, report rows whose `acuity` falls outside
# the numeric domain 1 to 5.
# `domain_numeric` is qualified with `daqapo::` to match the other calls in
# this document and to avoid relying on the package being attached.
activity_log %>%
  bupaR::filter(activity == "Triage in the ED") %>%
  daqapo::detect_value_range_violations(
    acuity = daqapo::domain_numeric(from = 1, to = 5)
  )
## $acuity
## $type
## [1] "numeric"
## 
## $from
## [1] 1
## 
## $to
## [1] 5
## 
## attr(,"class")
## [1] "value_range" "list"
## *** OUTPUT ***
## The domain range for column acuity is checked.
## Values allowed between 1 and 5
## The values fall within the specified domain range for 418052 (98.36%) of the rows in the activity log and outside the domain range for 6976 (1.64%) of these rows.
## 
## The following rows fall outside the specified domain range for indicated column:
## # Log of 6976 events consisting of:
## 1 trace 
## 6976 cases 
## 6976 instances of 1 activity 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 6,976 × 36
##    column_checked  stay_id subject_id  hadm_id activity         gender race 
##    <chr>             <int>      <int>    <int> <chr>            <chr>  <chr>
##  1 acuity         30001785   16061348 28146972 Triage in the ED <NA>   <NA> 
##  2 acuity         30003428   11889374 25084216 Triage in the ED <NA>   <NA> 
##  3 acuity         30003505   11229262       NA Triage in the ED <NA>   <NA> 
##  4 acuity         30003941   14334804       NA Triage in the ED <NA>   <NA> 
##  5 acuity         30004017   13419676 29317041 Triage in the ED <NA>   <NA> 
##  6 acuity         30004518   17237928 26689098 Triage in the ED <NA>   <NA> 
##  7 acuity         30006274   16169853 29415170 Triage in the ED <NA>   <NA> 
##  8 acuity         30007594   10554696 24910876 Triage in the ED <NA>   <NA> 
##  9 acuity         30008125   18064328 22704256 Triage in the ED <NA>   <NA> 
## 10 acuity         30011041   12155635 27459698 Triage in the ED <NA>   <NA> 
## # ℹ 6,966 more rows
## # ℹ 29 more variables: arrival_transport <chr>, disposition <chr>,
## #   seq_num <int>, icd_code <chr>, icd_version <int>, icd_title <chr>,
## #   temperature <dbl>, heartrate <dbl>, resprate <dbl>, o2sat <dbl>, sbp <dbl>,
## #   dbp <dbl>, pain <chr>, acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>,
## #   name <chr>, gsn <int>, ndc <dbl>, etc_rn <int>, etccode <int>,
## #   etcdescription <chr>, med_rn <int>, gsn_rn <int>, resource_id <lgl>, …

Detect invalid value range of pain

# Check that pain values of triage and vital-sign events lie within 0-10.
activity_log %>%
  bupaR::filter(
    activity %in% c("Triage in the ED", "Vital sign check")
  ) %>%
  # `pain` is a character column, so it is converted before the range check;
  # entries that cannot be parsed as numbers become NA (R emits a coercion
  # warning for them).
  bupaR::mutate(pain_num = as.numeric(pain)) %>%
  daqapo::detect_value_range_violations(
    pain_num = daqapo::domain_numeric(from = 0, to = 10)
  )
## Warning: There was 1 warning in `bupaR::mutate()`.
## ℹ In argument: `pain_num = as.numeric(pain)`.
## Caused by warning:
## ! NAs introduced by coercion
## $pain_num
## $type
## [1] "numeric"
## 
## $from
## [1] 0
## 
## $to
## [1] 10
## 
## attr(,"class")
## [1] "value_range" "list"
## *** OUTPUT ***
## The domain range for column pain_num is checked.
## Values allowed between 0 and 10
## The values fall within the specified domain range for 1304018 (70.53%) of the rows in the activity log and outside the domain range for 544744 (29.47%) of these rows.
## 
## The following rows fall outside the specified domain range for indicated column:
## # Log of 544744 events consisting of:
## 235810 cases 
## 544744 instances of 2 activities 
## 1 resource 
## Events occurred from NA until NA 
##  
## # Variables were mapped as follows:
## Case identifier:     stay_id 
## Activity identifier:     activity 
## Resource identifier:     resource_id 
## Timestamps:      start, complete 
## 
## # A tibble: 544,744 × 37
##    column_checked  stay_id subject_id  hadm_id activity         gender race 
##    <chr>             <int>      <int>    <int> <chr>            <chr>  <chr>
##  1 pain_num       30000039   13340997 23100190 Vital sign check <NA>   <NA> 
##  2 pain_num       30000039   13340997 23100190 Vital sign check <NA>   <NA> 
##  3 pain_num       30000039   13340997 23100190 Vital sign check <NA>   <NA> 
##  4 pain_num       30000112   13333760       NA Vital sign check <NA>   <NA> 
##  5 pain_num       30000202   15346940       NA Vital sign check <NA>   <NA> 
##  6 pain_num       30000202   15346940       NA Vital sign check <NA>   <NA> 
##  7 pain_num       30000202   15346940       NA Vital sign check <NA>   <NA> 
##  8 pain_num       30000204   11615015 25540031 Vital sign check <NA>   <NA> 
##  9 pain_num       30000204   11615015 25540031 Vital sign check <NA>   <NA> 
## 10 pain_num       30000204   11615015 25540031 Vital sign check <NA>   <NA> 
## # ℹ 544,734 more rows
## # ℹ 30 more variables: arrival_transport <chr>, disposition <chr>,
## #   seq_num <int>, icd_code <chr>, icd_version <int>, icd_title <chr>,
## #   temperature <dbl>, heartrate <dbl>, resprate <dbl>, o2sat <dbl>, sbp <dbl>,
## #   dbp <dbl>, pain <chr>, acuity <dbl>, chiefcomplaint <chr>, rhythm <chr>,
## #   name <chr>, gsn <int>, ndc <dbl>, etc_rn <int>, etccode <int>,
## #   etcdescription <chr>, med_rn <int>, gsn_rn <int>, resource_id <lgl>, …